1 Executive Summary

  1. NBA player Salary based on their statistics

2 Introduction

3 Loading and Exploring Data

3.1 Loading libraries required

library(knitr)
library(plyr)
library(dplyr)
library(tidyr)
library(caret)
library(ggplot2)
library(corrplot)
library(stringr)
library(scales)
library(randomForest)
library(psych)
library(glmnet)
library(rpart)
library(lubridate)
library(plotly)
opts_chunk$set(echo = TRUE, cache = TRUE)
opts_chunk$set(tidy.opts = list(width.cutoff = 60), tidy = TRUE, fig.height = 8, fig.width = 12)

3.3 File description

3.3.1 players.csv

Source: https://data.world/datadavis/nba-salaries
It contains one row per player: a primary-key id (X_id), birth date, birth place, career statistics, name, position and other biographical fields.

dim(players)
## [1] 4685   24
str(players)
## 'data.frame':    4685 obs. of  24 variables:
##  $ X_id       : chr  "abdelal01" "abdulza01" "abdulka01" "abdulma02" ...
##  $ birthDate  : chr  "June 24, 1968" "April 7, 1946" "April 16, 1947" "March 9, 1969" ...
##  $ birthPlace : chr  "Cairo, Egypt" "Brooklyn, New York" "New York, New York" "Gulfport, Mississippi" ...
##  $ career_AST : num  0.3 1.2 3.6 3.5 1.1 2.5 1.2 1 0.7 0.5 ...
##  $ career_FG. : chr  "50.2" "42.8" "55.9" "44.2" ...
##  $ career_FG3.: chr  "0.0" "" "5.6" "35.4" ...
##  $ career_FT. : chr  "70.1" "72.8" "72.1" "90.5" ...
##  $ career_G   : int  256 505 1560 586 236 830 319 1 56 174 ...
##  $ career_PER : chr  "13.0" "15.1" "24.6" "15.4" ...
##  $ career_PTS : num  5.7 9 24.6 14.6 7.8 18.1 5.6 0 9.5 5.3 ...
##  $ career_TRB : chr  "3.3" "8.0" "11.2" "1.9" ...
##  $ career_WS  : chr  "4.8" "17.5" "273.4" "25.2" ...
##  $ career_eFG.: chr  "50.2" "" "55.9" "47.2" ...
##  $ college    : chr  "Duke University" "Iowa State University" "University of California, Los Angeles" "Louisiana State University" ...
##  $ draft_pick : chr  "25th overall" "5th overall" "1st overall" "3rd overall" ...
##  $ draft_round: chr  "1st round" "1st round" "1st round" "1st round" ...
##  $ draft_team : chr  "Portland Trail Blazers" "Cincinnati Royals" "Milwaukee Bucks" "Denver Nuggets" ...
##  $ draft_year : chr  "1990" "1968" "1969" "1990" ...
##  $ height     : chr  "6-10" "6-9" "7-2" "6-1" ...
##  $ highSchool : chr  "Bloomfield in Bloomfield, New Jersey" "John Jay in Brooklyn, New York" "Power Memorial in New York, New York" "Gulfport in Gulfport, Mississippi" ...
##  $ name       : chr  "Alaa Abdelnaby" "Zaid Abdul-Aziz" "Kareem Abdul-Jabbar" "Mahmoud Abdul-Rauf" ...
##  $ position   : chr  "Power Forward" "Power Forward and Center" "Center" "Point Guard" ...
##  $ shoots     : chr  "Right" "Right" "Right" "Right" ...
##  $ weight     : chr  "240lb" "235lb" "225lb" "162lb" ...

3.3.2 salaries_1985to2018.csv

Source: https://data.world/datadavis/nba-salaries
It records the player_id (a foreign key to players.csv), the salary, the season and the team the player was paid by.

dim(salaries)
## [1] 14163     7
str(salaries)
## 'data.frame':    14163 obs. of  7 variables:
##  $ league      : chr  "NBA" "NBA" "NBA" "NBA" ...
##  $ player_id   : chr  "abdelal01" "abdelal01" "abdelal01" "abdelal01" ...
##  $ salary      : int  395000 494000 500000 805000 650000 1530000 2030000 2000000 3000000 1660000 ...
##  $ season      : chr  "1990-91" "1991-92" "1992-93" "1993-94" ...
##  $ season_end  : int  1991 1992 1993 1994 1995 1985 1986 1988 1989 1991 ...
##  $ season_start: int  1990 1991 1992 1993 1994 1984 1985 1987 1988 1990 ...
##  $ team        : chr  "Portland Trail Blazers" "Portland Trail Blazers" "Boston Celtics" "Boston Celtics" ...

3.3.3 all_seasons.csv

Source: https://www.kaggle.com/datasets/justinas/nba-players-data
It records the player names, teams, age, and other statistics for each individual season from the 1996 season to the 2020 season. (The index in this table is not linked to the previous tables.)

dim(Pstats)
## [1] 11700    22
str(Pstats)
## 'data.frame':    11700 obs. of  22 variables:
##  $ X                : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ player_name      : chr  "Travis Knight" "Matt Fish" "Matt Bullard" "Marty Conlon" ...
##  $ team_abbreviation: chr  "LAL" "MIA" "HOU" "BOS" ...
##  $ age              : num  22 27 30 29 22 22 36 26 33 32 ...
##  $ player_height    : num  213 211 208 211 206 ...
##  $ player_weight    : num  107 107 107 111 107 ...
##  $ college          : chr  "Connecticut" "North Carolina-Wilmington" "Iowa" "Providence" ...
##  $ country          : chr  "USA" "USA" "USA" "USA" ...
##  $ draft_year       : chr  "1996" "1992" "Undrafted" "Undrafted" ...
##  $ draft_round      : chr  "1" "2" "Undrafted" "Undrafted" ...
##  $ draft_number     : chr  "29" "50" "Undrafted" "Undrafted" ...
##  $ gp               : int  71 6 71 74 42 9 70 31 70 82 ...
##  $ pts              : num  4.8 0.3 4.5 7.8 3.7 1.6 3.2 2 11.3 9.9 ...
##  $ reb              : num  4.5 0.8 1.6 4.4 1.6 0.7 2.7 1.2 2.6 4.8 ...
##  $ ast              : num  0.5 0 0.9 1.4 0.5 0.4 0.3 0 4.9 11.4 ...
##  $ net_rating       : num  6.2 -15.1 0.9 -9 -14.5 -3.5 3.5 -17.1 -3.1 -2 ...
##  $ oreb_pct         : num  0.127 0.143 0.016 0.083 0.109 0.087 0.092 0.109 0.023 0.035 ...
##  $ dreb_pct         : num  0.182 0.267 0.115 0.152 0.118 0.045 0.146 0.152 0.088 0.116 ...
##  $ usg_pct          : num  0.142 0.265 0.151 0.167 0.233 0.135 0.137 0.232 0.192 0.155 ...
##  $ ts_pct           : num  0.536 0.333 0.535 0.542 0.482 0.47 0.555 0.448 0.597 0.525 ...
##  $ ast_pct          : num  0.052 0 0.099 0.101 0.114 0.125 0.034 0.013 0.289 0.464 ...
##  $ season           : chr  "1996-97" "1996-97" "1996-97" "1996-97" ...

4 Preprocessing data

4.1 Merge data tables

4.1.1 Merging players and salaries

In the salaries data, a player who changes team in the middle of a season receives salary from two teams and therefore has two entries. I will add up the salaries when there is more than one entry for the same player in one season.
I will keep the team they were on at the end of the season, because the player statistics data set uses the last team they played for.

# Collapse to one row per player-season: the salaries of all entries are
# summed and the last listed team is kept.
# NOTE(review): taking the last row as the end-of-season team assumes the
# rows of a player-season are in chronological order - confirm in the source.
# `.groups = "drop"` returns an ungrouped table so later steps do not
# silently inherit the player/season grouping.
salaries <- salaries %>%
    group_by(player_id, season_start, season_end, season) %>%
    summarise(salary = sum(salary), team = dplyr::last(team),
        league = "NBA", .groups = "drop")
## `summarise()` has grouped output by 'player_id', 'season_start', 'season_end'.
## You can override using the `.groups` argument.

Since both tables come from the same source and are linked by a primary key (players.X_id) and a foreign key (salaries.player_id), I will merge them on X_id and player_id.

# Join player biographies to their season salaries on the player id
# (merge() keeps only ids present in both tables by default).
player_salary <- players %>%
    merge(salaries, by.x = "X_id", by.y = "player_id")
dim(player_salary)
## [1] 13752    30
str(player_salary)
## 'data.frame':    13752 obs. of  30 variables:
##  $ X_id        : chr  "abdelal01" "abdelal01" "abdelal01" "abdelal01" ...
##  $ birthDate   : chr  "June 24, 1968" "June 24, 1968" "June 24, 1968" "June 24, 1968" ...
##  $ birthPlace  : chr  "Cairo, Egypt" "Cairo, Egypt" "Cairo, Egypt" "Cairo, Egypt" ...
##  $ career_AST  : num  0.3 0.3 0.3 0.3 0.3 3.6 3.6 3.6 3.6 3.5 ...
##  $ career_FG.  : chr  "50.2" "50.2" "50.2" "50.2" ...
##  $ career_FG3. : chr  "0.0" "0.0" "0.0" "0.0" ...
##  $ career_FT.  : chr  "70.1" "70.1" "70.1" "70.1" ...
##  $ career_G    : int  256 256 256 256 256 1560 1560 1560 1560 586 ...
##  $ career_PER  : chr  "13.0" "13.0" "13.0" "13.0" ...
##  $ career_PTS  : num  5.7 5.7 5.7 5.7 5.7 24.6 24.6 24.6 24.6 14.6 ...
##  $ career_TRB  : chr  "3.3" "3.3" "3.3" "3.3" ...
##  $ career_WS   : chr  "4.8" "4.8" "4.8" "4.8" ...
##  $ career_eFG. : chr  "50.2" "50.2" "50.2" "50.2" ...
##  $ college     : chr  "Duke University" "Duke University" "Duke University" "Duke University" ...
##  $ draft_pick  : chr  "25th overall" "25th overall" "25th overall" "25th overall" ...
##  $ draft_round : chr  "1st round" "1st round" "1st round" "1st round" ...
##  $ draft_team  : chr  "Portland Trail Blazers" "Portland Trail Blazers" "Portland Trail Blazers" "Portland Trail Blazers" ...
##  $ draft_year  : chr  "1990" "1990" "1990" "1990" ...
##  $ height      : chr  "6-10" "6-10" "6-10" "6-10" ...
##  $ highSchool  : chr  "Bloomfield in Bloomfield, New Jersey" "Bloomfield in Bloomfield, New Jersey" "Bloomfield in Bloomfield, New Jersey" "Bloomfield in Bloomfield, New Jersey" ...
##  $ name        : chr  "Alaa Abdelnaby" "Alaa Abdelnaby" "Alaa Abdelnaby" "Alaa Abdelnaby" ...
##  $ position    : chr  "Power Forward" "Power Forward" "Power Forward" "Power Forward" ...
##  $ shoots      : chr  "Right" "Right" "Right" "Right" ...
##  $ weight      : chr  "240lb" "240lb" "240lb" "240lb" ...
##  $ season_start: int  1990 1991 1992 1993 1994 1984 1985 1987 1988 1990 ...
##  $ season_end  : int  1991 1992 1993 1994 1995 1985 1986 1988 1989 1991 ...
##  $ season      : chr  "1990-91" "1991-92" "1992-93" "1993-94" ...
##  $ salary      : int  395000 494000 500000 805000 650000 1530000 2030000 2000000 3000000 1660000 ...
##  $ team        : chr  "Portland Trail Blazers" "Portland Trail Blazers" "Boston Celtics" "Boston Celtics" ...
##  $ league      : chr  "NBA" "NBA" "NBA" "NBA" ...

4.2 Merging tables from two sources

# Map each full team name in the salary data to the abbreviation used by the
# season-statistics table, so the two sources can be joined on team as well
# as on player name and season.
teams <- unique(player_salary$team)
# NOTE: this list is positional - entry i belongs to teams[i], i.e. to the
# order in which team names first appear in player_salary$team.
teamsABB <- c("POR", "BOS", "SAC", "LAL", "DEN", "VAN", "DAL",
    "ATL", "OKC", "DET", "LAC", "ORL", "HOU", "TOR", "NYK", "BKN",
    "PHX", "NJN", "MEM", "WAS", "CHA", "MIA", "GSW", "CHA", "MIN",
    "SAS", "NOP", "PHI", "WAS", "NOH", "CLE", "MIL", "CHI", "IND",
    "SEA", "UTA", "NOK", "None", "KCK")
# Fail with a clear message if the data no longer lines up with the list.
stopifnot(length(teams) == length(teamsABB))

teamABBTab <- data.frame(team_name = teams, team_ABB = teamsABB)

# match() is vectorised and always yields exactly one abbreviation (or NA)
# per row; sapply() + which() returns a list as soon as one team has no match.
player_salary$team_abbreviation <- teamABBTab$team_ABB[match(player_salary$team,
    teamABBTab$team_name)]
player_salary <- rename(player_salary, player_name = name)

# Inner join: only player-seasons found in both sources are kept.
Pstats_salary <- merge(player_salary, Pstats, by = c("player_name",
    "season", "team_abbreviation"))
dim(Pstats_salary)
## [1] 8625   50
str(Pstats_salary)
## 'data.frame':    8625 obs. of  50 variables:
##  $ player_name      : chr  "A.C. Green" "A.C. Green" "A.C. Green" "A.C. Green" ...
##  $ season           : chr  "1996-97" "1997-98" "1998-99" "1999-00" ...
##  $ team_abbreviation: chr  "DAL" "DAL" "DAL" "LAL" ...
##  $ X_id             : chr  "greenac01" "greenac01" "greenac01" "greenac01" ...
##  $ birthDate        : chr  "October 4, 1963" "October 4, 1963" "October 4, 1963" "October 4, 1963" ...
##  $ birthPlace       : chr  "Portland, Oregon" "Portland, Oregon" "Portland, Oregon" "Portland, Oregon" ...
##  $ career_AST       : num  1.1 1.1 1.1 1.1 1.1 0 1.8 1.8 1.8 3 ...
##  $ career_FG.       : chr  "49.4" "49.4" "49.4" "49.4" ...
##  $ career_FG3.      : chr  "25.4" "25.4" "25.4" "25.4" ...
##  $ career_FT.       : chr  "73.4" "73.4" "73.4" "73.4" ...
##  $ career_G         : int  1278 1278 1278 1278 1278 8 80 80 80 645 ...
##  $ career_PER       : chr  "14.4" "14.4" "14.4" "14.4" ...
##  $ career_PTS       : num  9.6 9.6 9.6 9.6 9.6 1 5.5 5.5 5.5 9.7 ...
##  $ career_TRB       : chr  "7.4" "7.4" "7.4" "7.4" ...
##  $ career_WS        : chr  "99.5" "99.5" "99.5" "99.5" ...
##  $ career_eFG.      : chr  "50.1" "50.1" "50.1" "50.1" ...
##  $ college.x        : chr  "Oregon State University" "Oregon State University" "Oregon State University" "Oregon State University" ...
##  $ draft_pick       : chr  "23rd overall" "23rd overall" "23rd overall" "23rd overall" ...
##  $ draft_round.x    : chr  "1st round" "1st round" "1st round" "1st round" ...
##  $ draft_team       : chr  "Los Angeles Lakers" "Los Angeles Lakers" "Los Angeles Lakers" "Los Angeles Lakers" ...
##  $ draft_year.x     : chr  "1985" "1985" "1985" "1985" ...
##  $ height           : chr  "6-9" "6-9" "6-9" "6-9" ...
##  $ highSchool       : chr  "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" ...
##  $ position         : chr  "Power Forward and Small Forward" "Power Forward and Small Forward" "Power Forward and Small Forward" "Power Forward and Small Forward" ...
##  $ shoots           : chr  "Right" "Right" "Right" "Right" ...
##  $ weight           : chr  "220lb" "220lb" "220lb" "220lb" ...
##  $ season_start     : int  1996 1997 1998 1999 2000 1999 2000 2001 2002 2007 ...
##  $ season_end       : int  1997 1998 1999 2000 2001 2000 2001 2002 2003 2008 ...
##  $ salary           : int  4851000 5095088 5125088 1700000 2250000 118974 316969 465850 18748 972720 ...
##  $ team             : chr  "Dallas Mavericks" "Dallas Mavericks" "Dallas Mavericks" "Los Angeles Lakers" ...
##  $ league           : chr  "NBA" "NBA" "NBA" "NBA" ...
##  $ X                : int  300 549 1001 1691 2031 1711 2030 2623 2920 5164 ...
##  $ age              : num  33 34 35 36 37 23 23 24 25 23 ...
##  $ player_height    : num  206 206 206 206 206 ...
##  $ player_weight    : num  102 102 102 102 102 ...
##  $ college.y        : chr  "Oregon State" "Oregon State" "Oregon State" "Oregon State" ...
##  $ country          : chr  "USA" "USA" "USA" "USA" ...
##  $ draft_year.y     : chr  "1985" "1985" "1985" "1985" ...
##  $ draft_round.y    : chr  "1" "1" "1" "1" ...
##  $ draft_number     : chr  "23" "23" "23" "23" ...
##  $ gp               : int  83 82 50 82 82 8 33 45 2 51 ...
##  $ pts              : num  7.2 7.3 4.9 5 4.5 1 6 5.4 0 5.2 ...
##  $ reb              : num  7.9 8.1 4.6 5.9 3.8 2.8 1.1 1 0 1.1 ...
##  $ ast              : num  0.8 1.5 0.5 1 0.5 0 1.9 1.8 1 1.7 ...
##  $ net_rating       : num  -8 -7.2 -5.6 8.1 3.3 -32.6 -12.4 -3.8 -3.9 -0.5 ...
##  $ oreb_pct         : num  0.1 0.09 0.097 0.089 0.089 0.158 0.018 0.022 0 0.026 ...
##  $ dreb_pct         : num  0.207 0.196 0.179 0.179 0.171 0.208 0.053 0.067 0 0.085 ...
##  $ usg_pct          : num  0.119 0.118 0.148 0.111 0.141 0.146 0.169 0.221 0.282 0.224 ...
##  $ ts_pct           : num  0.523 0.496 0.441 0.482 0.492 0.19 0.495 0.477 0 0.535 ...
##  $ ast_pct          : num  0.045 0.074 0.043 0.058 0.05 0 0.198 0.248 0.4 0.249 ...

4.3 saving new data table

# Persist the merged table. write.csv() also writes the row names as an
# unnamed first column, which appears as `X.1` when the file is read back.
write.csv(Pstats_salary, "dataset/all.csv")

4.3.1 Read in the saved table

# Reload the merged table. read.csv() re-infers column types from the text,
# so e.g. career_TRB and career_WS come back numeric and draft_year.x integer.
all <- read.csv("dataset/all.csv")

4.4 Removing repeated or directly correlated variables

dim(all)
## [1] 8625   51
str(all)
## 'data.frame':    8625 obs. of  51 variables:
##  $ X.1              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ player_name      : chr  "A.C. Green" "A.C. Green" "A.C. Green" "A.C. Green" ...
##  $ season           : chr  "1996-97" "1997-98" "1998-99" "1999-00" ...
##  $ team_abbreviation: chr  "DAL" "DAL" "DAL" "LAL" ...
##  $ X_id             : chr  "greenac01" "greenac01" "greenac01" "greenac01" ...
##  $ birthDate        : chr  "October 4, 1963" "October 4, 1963" "October 4, 1963" "October 4, 1963" ...
##  $ birthPlace       : chr  "Portland, Oregon" "Portland, Oregon" "Portland, Oregon" "Portland, Oregon" ...
##  $ career_AST       : num  1.1 1.1 1.1 1.1 1.1 0 1.8 1.8 1.8 3 ...
##  $ career_FG.       : chr  "49.4" "49.4" "49.4" "49.4" ...
##  $ career_FG3.      : chr  "25.4" "25.4" "25.4" "25.4" ...
##  $ career_FT.       : chr  "73.4" "73.4" "73.4" "73.4" ...
##  $ career_G         : int  1278 1278 1278 1278 1278 8 80 80 80 645 ...
##  $ career_PER       : chr  "14.4" "14.4" "14.4" "14.4" ...
##  $ career_PTS       : num  9.6 9.6 9.6 9.6 9.6 1 5.5 5.5 5.5 9.7 ...
##  $ career_TRB       : num  7.4 7.4 7.4 7.4 7.4 2.8 1 1 1 1.7 ...
##  $ career_WS        : num  99.5 99.5 99.5 99.5 99.5 -0.2 0.3 0.3 0.3 19.2 ...
##  $ career_eFG.      : chr  "50.1" "50.1" "50.1" "50.1" ...
##  $ college.x        : chr  "Oregon State University" "Oregon State University" "Oregon State University" "Oregon State University" ...
##  $ draft_pick       : chr  "23rd overall" "23rd overall" "23rd overall" "23rd overall" ...
##  $ draft_round.x    : chr  "1st round" "1st round" "1st round" "1st round" ...
##  $ draft_team       : chr  "Los Angeles Lakers" "Los Angeles Lakers" "Los Angeles Lakers" "Los Angeles Lakers" ...
##  $ draft_year.x     : int  1985 1985 1985 1985 1985 1999 2000 2000 2000 2007 ...
##  $ height           : chr  "6-9" "6-9" "6-9" "6-9" ...
##  $ highSchool       : chr  "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" ...
##  $ position         : chr  "Power Forward and Small Forward" "Power Forward and Small Forward" "Power Forward and Small Forward" "Power Forward and Small Forward" ...
##  $ shoots           : chr  "Right" "Right" "Right" "Right" ...
##  $ weight           : chr  "220lb" "220lb" "220lb" "220lb" ...
##  $ season_start     : int  1996 1997 1998 1999 2000 1999 2000 2001 2002 2007 ...
##  $ season_end       : int  1997 1998 1999 2000 2001 2000 2001 2002 2003 2008 ...
##  $ salary           : int  4851000 5095088 5125088 1700000 2250000 118974 316969 465850 18748 972720 ...
##  $ team             : chr  "Dallas Mavericks" "Dallas Mavericks" "Dallas Mavericks" "Los Angeles Lakers" ...
##  $ league           : chr  "NBA" "NBA" "NBA" "NBA" ...
##  $ X                : int  300 549 1001 1691 2031 1711 2030 2623 2920 5164 ...
##  $ age              : int  33 34 35 36 37 23 23 24 25 23 ...
##  $ player_height    : num  206 206 206 206 206 ...
##  $ player_weight    : num  102 102 102 102 102 ...
##  $ college.y        : chr  "Oregon State" "Oregon State" "Oregon State" "Oregon State" ...
##  $ country          : chr  "USA" "USA" "USA" "USA" ...
##  $ draft_year.y     : chr  "1985" "1985" "1985" "1985" ...
##  $ draft_round.y    : chr  "1" "1" "1" "1" ...
##  $ draft_number     : chr  "23" "23" "23" "23" ...
##  $ gp               : int  83 82 50 82 82 8 33 45 2 51 ...
##  $ pts              : num  7.2 7.3 4.9 5 4.5 1 6 5.4 0 5.2 ...
##  $ reb              : num  7.9 8.1 4.6 5.9 3.8 2.8 1.1 1 0 1.1 ...
##  $ ast              : num  0.8 1.5 0.5 1 0.5 0 1.9 1.8 1 1.7 ...
##  $ net_rating       : num  -8 -7.2 -5.6 8.1 3.3 -32.6 -12.4 -3.8 -3.9 -0.5 ...
##  $ oreb_pct         : num  0.1 0.09 0.097 0.089 0.089 0.158 0.018 0.022 0 0.026 ...
##  $ dreb_pct         : num  0.207 0.196 0.179 0.179 0.171 0.208 0.053 0.067 0 0.085 ...
##  $ usg_pct          : num  0.119 0.118 0.148 0.111 0.141 0.146 0.169 0.221 0.282 0.224 ...
##  $ ts_pct           : num  0.523 0.496 0.441 0.482 0.492 0.19 0.495 0.477 0 0.535 ...
##  $ ast_pct          : num  0.045 0.074 0.043 0.058 0.05 0 0.198 0.248 0.4 0.249 ...

4.4.1 College

Since college.x and college.y mean the same thing (the college the player attended before the NBA), I will remove college.y and rename college.x to college.

# Keep a single college column: drop the duplicate from the stats table and
# give the remaining one its plain name.
all <- select(all, !college.y)
all <- rename(all, college = college.x)

4.4.2 Height and Weight

4.4.2.1 Height

There are two height columns: player_height is in centimetres, while height is a feet-inches string (e.g. "6-10"). I will first convert height to total inches.

# Convert "feet-inches" strings such as "6-10" to total inches.
# str_match() is vectorised (no per-element sapply needed) and splits on the
# hyphen, so it does not assume the feet part is exactly one character.
# Values that do not look like "<feet>-<inches>" become NA.
height_parts <- str_match(all$height, "^([0-9]+)-([0-9]+)$")
heightinch <- as.numeric(height_parts[, 2]) * 12 + as.numeric(height_parts[,
    3])

# Check that the converted height agrees with the metric player_height.
height_cor <- cor(heightinch, all$player_height)

height_cor
## [1] 0.994507

Since height and player_height are highly correlated, I will remove player_height, because the imperial unit system (feet and inches) is more widely used in the NBA.

# Store the numeric height in inches and drop the redundant metric column.
all <- all %>%
    mutate(height = heightinch) %>%
    select(!player_height)

4.4.2.2 Weight

There are two weight columns, player_weight and weight; I will check whether they agree. Judging from the first rows, weight is about 2.2 times player_weight, so player_weight should be in kg while weight is in lb.

# Strip the "lb" unit suffix (e.g. "240lb" -> 240). sub() is vectorised, and
# removing only a literal trailing "lb" avoids corrupting a value that has no
# suffix, which blindly cutting the last two characters would do.
weightlb <- as.numeric(sub("lb$", "", all$weight))

# Check that the pound values agree with the player_weight column.
weight_cor <- cor(weightlb, all$player_weight)
weight_cor
## [1] 0.9462468

Since weight and player_weight are highly correlated (0.9462468), I will remove player_weight, because the imperial unit system (lb) is more widely used in the NBA.

# Store the numeric weight in pounds and drop the redundant player_weight.
all <- all %>%
    mutate(weight = weightlb) %>%
    select(!player_weight)

4.4.3 Drafting

range(all$draft_year.x[!is.na(all$draft_year.x)])
## [1] 1976 2017

From Wikipedia (https://en.wikipedia.org/wiki/NBA_draft), there are 10 draft rounds from 1976 to 1984, 7 draft rounds from 1985 to 1988, and 2 rounds from 1989 onward.

4.4.3.1 Draft Year

There is repeated draft year: draft_year.x and draft_year.y.

unique(all$draft_year.y[is.na(all$draft_year.x)])
## [1] "Undrafted" "1995"      "1992"      "1999"      "1994"      "1997"     
## [7] "1982"      "2011"
unique(all$draft_year.x[all$draft_year.y == "Undrafted"])
## [1] NA

Some NA values in draft_year.x correspond to an actual year in draft_year.y, while every "Undrafted" in draft_year.y is NA in draft_year.x. draft_year.y is therefore the more complete column, so I will remove draft_year.x and rename draft_year.y to draft_year.

# Keep the more complete draft year column under the plain name.
all <- select(all, !draft_year.x)
all <- rename(all, draft_year = draft_year.y)

4.4.3.2 Draft round

There are draft_round.x and draft_round.y

unique(all$draft_round.x)
## [1] "1st round" "2nd round" ""          "3rd round" "4th round" "8th round"
## [7] "7th round" "6th round"
unique(all$draft_round.y)
## [1] "1"         "2"         "Undrafted" "3"         "8"         "4"        
## [7] "7"         "6"

I will keep draft_round.y because the data is cleaner ("1" vs "1st round"), and I will recode every pick from the third round or later as "Undrafted", since after 1989 there are only first-round picks, second-round picks and undrafted players.

# Keep the cleaner draft round column under the plain name.
all <- rename(select(all, !draft_round.x), draft_round = draft_round.y)

# Anything other than round 1, round 2 or "Undrafted" is recoded as
# "Undrafted".
late_round <- !(all$draft_round %in% c("1", "2", "Undrafted"))
all$draft_round[late_round] <- "Undrafted"

4.4.3.3 Draft number and Draft pick

unique(all$draft_number)
##  [1] "23"        "39"        "32"        "26"        "4"         "49"       
##  [7] "Undrafted" "17"        "58"        "11"        "10"        "3"        
## [13] "8"         "42"        "25"        "15"        "14"        "16"       
## [19] "29"        "12"        "60"        "5"         "45"        "20"       
## [25] "31"        "1"         "2"         "50"        "21"        "56"       
## [31] "47"        "9"         "30"        "46"        "35"        "24"       
## [37] "34"        "44"        "28"        "19"        "38"        "52"       
## [43] "53"        "6"         "36"        "27"        "18"        "51"       
## [49] "43"        "7"         "33"        "22"        "40"        "37"       
## [55] "41"        "13"        "48"        "165"       "75"        "55"       
## [61] "57"        "59"        "54"        "79"        "63"        "160"      
## [67] "120"       "127"       "78"
unique(all$draft_pick)
##  [1] "23rd overall"  "39th overall"  "32nd overall"  "26th overall" 
##  [5] "4th overall"   "49th overall"  ""              "17th overall" 
##  [9] "58th overall"  "11th overall"  "10th overall"  "3rd overall"  
## [13] "8th overall"   "42nd overall"  "25th overall"  "15th overall" 
## [17] "14th overall"  "16th overall"  "29th overall"  "12th overall" 
## [21] "60th overall"  "5th overall"   "45th overall"  "20th overall" 
## [25] "31st overall"  "1st overall"   "2nd overall"   "51st overall" 
## [29] "21st overall"  "57th overall"  "47th overall"  "9th overall"  
## [33] "56th overall"  "30th overall"  "46th overall"  "35th overall" 
## [37] "24th overall"  "34th overall"  "44th overall"  "28th overall" 
## [41] "19th overall"  "38th overall"  "52nd overall"  "53rd overall" 
## [45] "6th overall"   "36th overall"  "48th overall"  "27th overall" 
## [49] "77th overall"  "18th overall"  "43rd overall"  "7th overall"  
## [53] "33rd overall"  "22nd overall"  "40th overall"  "37th overall" 
## [57] "13th overall"  "165th overall" "50th overall"  "75th overall" 
## [61] "55th overall"  "41st overall"  "59th overall"  "54th overall" 
## [65] "79th overall"  "63rd overall"  "160th overall" "120th overall"
## [69] "127th overall"
# Numeric pick parsed from strings such as "23rd overall", then the number of
# rows where it disagrees with draft_number. Non-numeric draft_number values
# (e.g. "Undrafted") coerce to NA with a warning and are ignored by na.rm.
temp_dp <- all$draft_pick %>%
    str_extract("[0-9]+") %>%
    as.numeric()
pick_differs <- temp_dp != as.numeric(all$draft_number)
sum(pick_differs, na.rm = TRUE)
## Warning: NAs introduced by coercion
## [1] 191
# Show the first 20 rows where the two draft position columns disagree.
mismatch_rows <- which(temp_dp != as.numeric(all$draft_number))
mismatch_cols <- c("X.1", "player_name", "draft_pick", "draft_number",
    "draft_year", "season", "team")
kable(head(all[mismatch_rows, mismatch_cols], 20))
## Warning in which(temp_dp != as.numeric(all$draft_number)): NAs introduced by
## coercion
X.1 player_name draft_pick draft_number draft_year season team
238 238 Alton Ford 51st overall 50 2001 2001-02 Phoenix Suns
239 239 Alton Ford 51st overall 50 2001 2002-03 Phoenix Suns
240 240 Alton Ford 51st overall 50 2001 2003-04 Houston Rockets
243 243 Alvin Jones 57th overall 56 2001 2001-02 Philadelphia 76ers
622 622 Antonis Fotsis 48th overall 47 2001 2001-02 Memphis Grizzlies
655 655 Arvydas Sabonis 77th overall 24 1986 1996-97 Portland Trail Blazers
656 656 Arvydas Sabonis 77th overall 24 1986 1997-98 Portland Trail Blazers
657 657 Arvydas Sabonis 77th overall 24 1986 1998-99 Portland Trail Blazers
658 658 Arvydas Sabonis 77th overall 24 1986 1999-00 Portland Trail Blazers
659 659 Arvydas Sabonis 77th overall 24 1986 2000-01 Portland Trail Blazers
660 660 Arvydas Sabonis 77th overall 24 1986 2002-03 Portland Trail Blazers
860 860 Bobby Simmons 42nd overall 41 2001 2001-02 Washington Wizards
861 861 Bobby Simmons 42nd overall 41 2001 2002-03 Washington Wizards
862 862 Bobby Simmons 42nd overall 41 2001 2003-04 Los Angeles Clippers
863 863 Bobby Simmons 42nd overall 41 2001 2004-05 Los Angeles Clippers
864 864 Bobby Simmons 42nd overall 41 2001 2005-06 Milwaukee Bucks
865 865 Bobby Simmons 42nd overall 41 2001 2007-08 Milwaukee Bucks
866 866 Bobby Simmons 42nd overall 41 2001 2008-09 New Jersey Nets
867 867 Bobby Simmons 42nd overall 41 2001 2009-10 New Jersey Nets
868 868 Bobby Simmons 42nd overall 41 2001 2010-11 San Antonio Spurs

I manually looked up the draft picks and found that draft_number is mostly the correct one when there is a difference, so I will drop draft_pick.

# draft_number is kept as the draft position; drop the duplicate column.
all <- select(all, !draft_pick)

4.4.4 League

unique(all$league)
## [1] "NBA"

Since there is no variation in the league, I will remove the variable

# league is constant ("NBA"), so it carries no information.
all <- select(all, !league)

4.4.5 team

team and team_abbreviation carry the same information, so I will remove team and keep team_abbreviation.

# team_abbreviation already identifies the team; drop the full name.
all <- select(all, !team)

4.4.6 Season

Since season_end, season_start and season represent the same information, I will remove season and season_end.

# season_start alone identifies the season; drop the two redundant columns.
all <- select(all, -season, -season_end)

4.4.7 Career Stats

# Drop every career_* column. A logical mask is used instead of -grep(...):
# when nothing matches, grep() returns integer(0) and all[, -integer(0)]
# selects zero columns, silently emptying the table.
all <- all[, !grepl("^career", names(all)), drop = FALSE]

5 Exploratory Analysis

dim(all)
## [1] 8625   31
str(all)
## 'data.frame':    8625 obs. of  31 variables:
##  $ X.1              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ player_name      : chr  "A.C. Green" "A.C. Green" "A.C. Green" "A.C. Green" ...
##  $ team_abbreviation: chr  "DAL" "DAL" "DAL" "LAL" ...
##  $ X_id             : chr  "greenac01" "greenac01" "greenac01" "greenac01" ...
##  $ birthDate        : chr  "October 4, 1963" "October 4, 1963" "October 4, 1963" "October 4, 1963" ...
##  $ birthPlace       : chr  "Portland, Oregon" "Portland, Oregon" "Portland, Oregon" "Portland, Oregon" ...
##  $ college          : chr  "Oregon State University" "Oregon State University" "Oregon State University" "Oregon State University" ...
##  $ draft_team       : chr  "Los Angeles Lakers" "Los Angeles Lakers" "Los Angeles Lakers" "Los Angeles Lakers" ...
##  $ height           : num  81 81 81 81 81 82 73 73 73 72 ...
##  $ highSchool       : chr  "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" "Benson Polytechnic in Portland, Oregon" ...
##  $ position         : chr  "Power Forward and Small Forward" "Power Forward and Small Forward" "Power Forward and Small Forward" "Power Forward and Small Forward" ...
##  $ shoots           : chr  "Right" "Right" "Right" "Right" ...
##  $ weight           : num  220 220 220 220 220 227 180 180 180 161 ...
##  $ season_start     : int  1996 1997 1998 1999 2000 1999 2000 2001 2002 2007 ...
##  $ salary           : int  4851000 5095088 5125088 1700000 2250000 118974 316969 465850 18748 972720 ...
##  $ X                : int  300 549 1001 1691 2031 1711 2030 2623 2920 5164 ...
##  $ age              : int  33 34 35 36 37 23 23 24 25 23 ...
##  $ country          : chr  "USA" "USA" "USA" "USA" ...
##  $ draft_year       : chr  "1985" "1985" "1985" "1985" ...
##  $ draft_round      : chr  "1" "1" "1" "1" ...
##  $ draft_number     : chr  "23" "23" "23" "23" ...
##  $ gp               : int  83 82 50 82 82 8 33 45 2 51 ...
##  $ pts              : num  7.2 7.3 4.9 5 4.5 1 6 5.4 0 5.2 ...
##  $ reb              : num  7.9 8.1 4.6 5.9 3.8 2.8 1.1 1 0 1.1 ...
##  $ ast              : num  0.8 1.5 0.5 1 0.5 0 1.9 1.8 1 1.7 ...
##  $ net_rating       : num  -8 -7.2 -5.6 8.1 3.3 -32.6 -12.4 -3.8 -3.9 -0.5 ...
##  $ oreb_pct         : num  0.1 0.09 0.097 0.089 0.089 0.158 0.018 0.022 0 0.026 ...
##  $ dreb_pct         : num  0.207 0.196 0.179 0.179 0.171 0.208 0.053 0.067 0 0.085 ...
##  $ usg_pct          : num  0.119 0.118 0.148 0.111 0.141 0.146 0.169 0.221 0.282 0.224 ...
##  $ ts_pct           : num  0.523 0.496 0.441 0.482 0.492 0.19 0.495 0.477 0 0.535 ...
##  $ ast_pct          : num  0.045 0.074 0.043 0.058 0.05 0 0.198 0.248 0.4 0.249 ...

5.1 The response variable: salary

5.1.1 Before inflation

# Per-season summary of the nominal salary: median, mean, standard deviation.
sal_by_season <- group_by(all, season_start)
sal_year <- summarise(sal_by_season, median = median(salary), mean = mean(salary),
    sd = sd(salary))

# Median and mean salary per season on one line chart.
fig1 <- plot_ly(sal_year, x = ~season_start, y = ~median, name = "Median salary",
    type = "scatter", mode = "lines")
fig1 <- add_trace(fig1, y = ~mean, name = "Mean salary")
fig1 <- layout(fig1, title = "Salary in NBA from 1996-97 season to 2017-18 season",
    yaxis = list(title = "Salary (USD)"), xaxis = list(title = "Season"))
fig1
# Spread of salaries per season.
fig2 <- plot_ly(sal_year, x = ~season_start, y = ~sd, name = "standard deviation",
    type = "scatter", mode = "lines")
fig2 <- layout(fig2, title = "Standard Deviation of NBA Salary")
fig2
# Distribution of individual salaries per season.
fig3 <- plot_ly(data = all, x = ~season_start, y = ~salary, type = "box")
fig3

5.1.2 After inflation

# Inflation table. NOTE(review): columns 1 and 3 are selected by position and
# must supply `year` and the `pct` factor used below - confirm against
# files/inflation.csv.
inflation <- read.csv("files/inflation.csv")

# merge() is an inner join: seasons absent from the inflation table are dropped.
all <- merge(all, inflation[, c(1, 3)], by.x = "season_start",
    by.y = "year")
# presumably pct is the price level relative to a reference year - TODO confirm
all$salary_infl <- all$salary/all$pct
# Per-season summary of the inflation-adjusted salary.
sal_year <- all %>%
    group_by(season_start) %>%
    summarise(median = median(salary_infl), mean = mean(salary_infl),
        sd = sd(salary_infl))

# Titles name the adjusted figures so these plots are distinguishable from the
# nominal-salary plots of the previous section.
fig1 <- plot_ly(sal_year, x = ~season_start, y = ~median, name = "Median salary",
    type = "scatter", mode = "lines") %>%
    add_trace(y = ~mean, name = "Mean salary") %>%
    layout(title = "Inflation-adjusted salary in NBA from 1996-97 season to 2017-18 season",
        yaxis = list(title = "Inflation-adjusted salary (USD)"),
        xaxis = list(title = "Season"))
fig1
fig2 <- plot_ly(sal_year, x = ~season_start, y = ~sd, name = "standard deviation",
    type = "scatter", mode = "lines") %>%
    layout(title = "Standard Deviation of inflation-adjusted NBA Salary")
fig2
# Box plot of the adjusted salary (the original plotted the raw `salary` here,
# repeating the pre-inflation figure).
fig3 <- plot_ly(data = all, x = ~season_start, y = ~salary_infl,
    type = "box")
fig3

5.2 Decade

I will split year into decades for better visualization

# Bin the season start year into decades. right = FALSE makes the intervals
# [1990, 2000), [2000, 2010), [2010, 2020), so a season starting in 2000 or
# 2010 belongs to the decade it opens. cut()'s default right-closed intervals
# would put those seasons in the previous decade.
all$decade <- cut(all$season_start, breaks = c(1990, 2000, 2010,
    2020), labels = c("1990-2000", "2000-2010", "2010-2020"),
    right = FALSE)

plot_ly(data = all, x = ~decade, y = ~salary_infl, type = "box")

5.3 Position

Position:

The position the player play

I will create a variable recording the players’ primary position

# Primary position = the first position listed. Everything from the first
# "," or " and " onward is dropped, which handles a single position,
# "A and B", and a comma-separated list alike. The previous approach always
# cut two trailing characters, so a comma-separated list lost the last letter
# of its first position and then became NA in the factor below.
pos <- str_trim(str_replace(all$position, "(,| and ).*$", ""))
all$pri_position <- pos
# 1 when the player is listed at more than one position, 0 otherwise.
all$mult_pos <- as.numeric(grepl("and", all$position))
# Order the levels from guards to centers for plotting.
all$pri_position <- factor(all$pri_position, levels = c("Point Guard",
    "Shooting Guard", "Small Forward", "Power Forward", "Center"))
ggplot(all, aes(x = pri_position, y = salary_infl, col = decade)) +
    geom_boxplot()

All positions show a generally increasing trend except Center. Although the median salary of centers remains similar in the 2010s compared to the 2000s, the upper outliers drop significantly in the 2010s. While the median of small forwards remains similar, the upper outliers plummeted during the period.

## Important numeric variables

# Positions (named by column) of all numeric columns in `all`.
is_num_col <- vapply(all, is.numeric, logical(1))
numVar <- which(is_num_col)
numVarName <- names(numVar)

There are 20 numeric variables

# Numeric columns without the nominal salary, so salary_infl is the only
# salary measure in the correlation matrix.
all_numVar <- all[, numVar]
all_numVar <- select(all_numVar, !salary)
cor_numVar <- cor(all_numVar, use = "pairwise.complete.obs")

# Variables ordered by decreasing correlation with the adjusted salary.
cor_sorted <- as.matrix(sort(cor_numVar[, "salary_infl"], decreasing = TRUE))

# Keep the top ten. head() stops at the number of variables available,
# whereas [1:10] would pad with NA names and break the subsetting below.
cor_high <- head(row.names(cor_sorted), 10)

cor_numVar <- cor_numVar[cor_high, cor_high]

corrplot.mixed(cor_numVar, tl.pos = "lt")

5.3.1 Points

Point:

Average number of points scored

The points and salary correlation for each decade is very similar.

# Points per game against adjusted salary, one panel per decade, each with a
# black GAM trend line.
g1 <- ggplot(all, aes(x = pts, y = salary_infl, colour = decade)) +
    geom_point() +
    geom_smooth(colour = "black", method = "gam", formula = y ~ x) +
    facet_grid(rows = vars(decade)) +
    xlab("points per game") +
    ylab("salary after inflation (USD)") +
    ggtitle("point vs salary of each decade")
g1

There is no significant change in points per game in each position

# Distribution of points per game for every season, one panel per primary
# position.
g2 <- ggplot(all, aes(x = as.factor(season_start), y = pts, fill = pri_position)) +
    geom_boxplot() +
    facet_grid(rows = vars(pri_position)) +
    xlab("Year") +
    ylab("points per game") +
    ggtitle("points per game throughout the years for each position")
g2

5.3.2 Rebounds

Rebounds:

Average number of rebounds grabbed
# Rebounds per game against adjusted salary, one panel per primary position,
# each with a black GAM trend line. The title now names the actual facet
# variable (position); it previously said "each decade".
g3 <- ggplot(all, aes(x = reb, y = salary_infl, col = pri_position)) +
    geom_point() + geom_smooth(col = "black", method = "gam",
    formula = y ~ x) + facet_grid(pri_position ~ .) + labs(x = "rebounds per game",
    y = "salary after inflation (USD)", title = "rebounds vs salary for each position")
g3

# Histogram of rebounds per game (bins of width 1), stacked by primary
# position, one panel per decade.
g4 <- ggplot(all, aes(x = reb, fill = pri_position)) +
    geom_histogram(binwidth = 1) +
    facet_grid(rows = vars(decade)) +
    xlab("rebounds per game") +
    ylab("frequency") +
    ggtitle("rebounds per game for each decade") +
    theme(legend.position = "bottom")
g4

# Correlation matrix of the numeric variables for the rows of one decade,
# with rows and columns ordered by decreasing correlation with salary_infl.
# Variables whose correlation with salary_infl is NA are dropped by sort().
decade_cor <- function(num_data, decade, level) {
    cor_mat <- cor(num_data[which(decade == level), ], use = "pairwise.complete.obs")
    ordered <- names(sort(cor_mat[, "salary_infl"], decreasing = TRUE))
    cor_mat[ordered, ordered]
}

# One matrix and one plot per decade. Previously the result names were
# swapped and overwritten, and the second plot redrew the first decade's
# matrix instead of its own.
cor_numVar1990 <- decade_cor(all_numVar, all$decade, "1990-2000")

corrplot.mixed(cor_numVar1990, tl.pos = "lt")

cor_numVar2000 <- decade_cor(all_numVar, all$decade, "2000-2010")

corrplot.mixed(cor_numVar2000, tl.pos = "lt")

cor_numVar2010 <- decade_cor(all_numVar, all$decade, "2010-2020")

corrplot.mixed(cor_numVar2010, tl.pos = "lt")